# Carregando bibliotecas
library(DT)
library(ROCR)
library(MASS)
library(caret)
library(dplyr)
library(stats)
library(plotly)
library(readxl)
library(caTools)
library(ggplot2)
library(corrplot)
library(varhandle)
set.seed(12)
# Read and filter the data: build a binary target that is 1 when the
# observation is "virginica" and 0 otherwise.
df <- iris
# One-hot encode Species (varhandle::to.dummy), then keep only the
# virginica indicator column as the response.
especies_b <- to.dummy(iris$Species, "species") %>% as.data.frame()
especies <- especies_b$species.virginica
df <- data.frame(df[, 1:4], especies) %>%
  dplyr::mutate(especies = as.factor(especies))
# Train/test split - 90/10, stratified on the target
# (use `<-` for assignment, not `=`)
divisao_df <- sample.split(df$especies, SplitRatio = 0.90)
df_treino <- subset(df, divisao_df == TRUE)
df_teste <- subset(df, divisao_df == FALSE)
# Scatter plots of petal dimensions, coloured by the binary target,
# rendered interactively with plotly
plot_treino <- ggplot(df_treino,
                      aes(x = Petal.Length, y = Petal.Width, color = especies)) +
  geom_point() +
  ggtitle("DF Treino")
ggplotly(plot_treino)

plot_teste <- ggplot(df_teste,
                     aes(x = Petal.Length, y = Petal.Width, color = especies)) +
  geom_point() +
  ggtitle("DF Teste")
ggplotly(plot_teste)
# Model construction: logistic regression, virginica vs. the rest.
# Pass a formula object rather than a string -- glm() only coerces the
# string via as.formula(), and a real formula keeps its environment.
modelo <- glm(especies ~ ., data = df_treino, family = "binomial")
modelo
##
## Call: glm(formula = especies ~ ., family = "binomial", data = df_treino)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
## -53.280 -1.182 -4.318 9.238 16.313
##
## Degrees of Freedom: 134 Total (i.e. Null); 130 Residual
## Null Deviance: 171.9
## Residual Deviance: 10.94 AIC: 20.94
# Predictions on the held-out test set: P(virginica), with round()
# acting as an implicit 0.5 decision threshold
predicoes_teste <- predict(modelo, df_teste, type = "response") %>%
  as.numeric() %>%
  round()
predicoes_teste
## [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
# Classify two new, hand-built observations with the fitted model
flor1 <- data.frame(
  Sepal.Length = 6.4, Sepal.Width = 2.8,
  Petal.Length = 4.6, Petal.Width = 1.8
)
flor2 <- data.frame(
  Sepal.Length = 6.3, Sepal.Width = 2.5,
  Petal.Length = 4.1, Petal.Width = 1.7
)
round(as.numeric(predict(modelo, flor1, type = "response")))
## [1] 0
round(as.numeric(predict(modelo, flor2, type = "response")))
## [1] 0
# Confusion matrix on the test set; class "1" (virginica) is treated as
# the positive class
predicoes_fator <- as.factor(predicoes_teste)
referencia_fator <- as.factor(df_teste$especies)
confusionMatrix(data = predicoes_fator,
                reference = referencia_fator,
                positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10 1
## 1 0 4
##
## Accuracy : 0.9333
## 95% CI : (0.6805, 0.9983)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.01941
##
## Kappa : 0.8421
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.8000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9091
## Prevalence : 0.3333
## Detection Rate : 0.2667
## Detection Prevalence : 0.2667
## Balanced Accuracy : 0.9000
##
## 'Positive' Class : 1
##
set.seed(100)
# Read the credit dataset (path relative to the document's directory)
df <- read.csv("../inputs/credit_dataset.csv")
# Standardise (z-score) the numeric predictors. Spell out TRUE/FALSE --
# the T/F shorthands are ordinary, reassignable variables.
numeric_vars <- c("credit.duration.months", "age", "credit.amount")
for (i in numeric_vars) {
  df[[i]] <- scale(df[[i]], center = TRUE, scale = TRUE)
}
# Convert the categorical predictors (and the target) to factors
var_factors <- c("credit.rating", "account.balance", "previous.credit.payment.status",
                 "credit.purpose", "savings", "employment.duration", "installment.rate",
                 "marital.status", "guarantor", "residence.duration", "current.assets",
                 "other.credits", "apartment.type", "bank.credits", "occupation",
                 "dependents", "telephone", "foreign.worker")
for (i in var_factors) {
  df[[i]] <- as.factor(df[[i]])
}
# Train/test split 60/40, stratified on the target
# (use `<-` for assignment, not `=`)
divisao_df <- sample.split(df$credit.rating, SplitRatio = 0.60)
df_treino <- subset(df, divisao_df == TRUE)
df_teste <- subset(df, divisao_df == FALSE)
# Model creation: logistic regression of credit.rating on all predictors.
# Pass a formula object rather than a string -- glm() only coerces the
# string via as.formula(), and a real formula keeps its environment.
modelo <- glm(credit.rating ~ ., data = df_treino, family = "binomial")
summary(modelo)
##
## Call:
## glm(formula = credit.rating ~ ., family = "binomial", data = df_treino)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4823 -0.6585 0.3899 0.7028 2.3138
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.03367 1.09715 0.942 0.346120
## account.balance2 0.42494 0.28978 1.466 0.142532
## account.balance3 1.68601 0.28461 5.924 3.14e-09 ***
## credit.duration.months -0.18457 0.14413 -1.281 0.200327
## previous.credit.payment.status2 0.83888 0.41288 2.032 0.042175 *
## previous.credit.payment.status3 1.75151 0.44006 3.980 6.89e-05 ***
## credit.purpose2 -1.10025 0.49904 -2.205 0.027472 *
## credit.purpose3 -1.33254 0.48031 -2.774 0.005532 **
## credit.purpose4 -1.76621 0.47177 -3.744 0.000181 ***
## credit.amount -0.34239 0.16305 -2.100 0.035734 *
## savings2 0.73669 0.39983 1.843 0.065400 .
## savings3 1.29027 0.48161 2.679 0.007383 **
## savings4 0.64672 0.32181 2.010 0.044471 *
## employment.duration2 0.35496 0.31163 1.139 0.254682
## employment.duration3 0.77129 0.38109 2.024 0.042978 *
## employment.duration4 0.40428 0.36096 1.120 0.262712
## installment.rate2 0.13630 0.39674 0.344 0.731193
## installment.rate3 -0.60469 0.43670 -1.385 0.166150
## installment.rate4 -0.38190 0.38414 -0.994 0.320138
## marital.status3 0.24796 0.26044 0.952 0.341054
## marital.status4 0.70870 0.41647 1.702 0.088816 .
## guarantor2 0.46403 0.35729 1.299 0.194029
## residence.duration2 -0.37545 0.36971 -1.016 0.309857
## residence.duration3 -0.56620 0.41439 -1.366 0.171826
## residence.duration4 -0.13539 0.37974 -0.357 0.721433
## current.assets2 -0.23866 0.32961 -0.724 0.469016
## current.assets3 -0.35931 0.30423 -1.181 0.237589
## current.assets4 -0.50886 0.53965 -0.943 0.345709
## age 0.08214 0.13168 0.624 0.532774
## other.credits2 0.13490 0.29385 0.459 0.646182
## apartment.type2 0.67322 0.31034 2.169 0.030062 *
## apartment.type3 0.47768 0.60814 0.785 0.432179
## bank.credits2 -0.19551 0.30245 -0.646 0.517995
## occupation2 -1.31950 0.86268 -1.530 0.126134
## occupation3 -1.65837 0.83024 -1.997 0.045777 *
## occupation4 -1.48500 0.87965 -1.688 0.091378 .
## dependents2 -0.22212 0.33562 -0.662 0.508086
## telephone2 0.36689 0.26467 1.386 0.165685
## foreign.worker2 2.35080 0.90337 2.602 0.009262 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 733.04 on 599 degrees of freedom
## Residual deviance: 540.02 on 561 degrees of freedom
## AIC: 618.02
##
## Number of Fisher Scoring iterations: 5
# Test-set predictions (probabilities rounded at the 0.5 threshold)
# and the resulting confusion matrix; "1" is the positive class
probs_teste <- predict(modelo, df_teste, type = "response")
predi_teste <- as.factor(round(as.numeric(probs_teste)))
confusionMatrix(data = predi_teste,
                reference = as.factor(df_teste$credit.rating),
                positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 64 44
## 1 56 236
##
## Accuracy : 0.75
## 95% CI : (0.7046, 0.7917)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.01553
##
## Kappa : 0.3873
##
## Mcnemar's Test P-Value : 0.27133
##
## Sensitivity : 0.8429
## Specificity : 0.5333
## Pos Pred Value : 0.8082
## Neg Pred Value : 0.5926
## Prevalence : 0.7000
## Detection Rate : 0.5900
## Detection Prevalence : 0.7300
## Balanced Accuracy : 0.6881
##
## 'Positive' Class : 1
##